library(tidyverse)
library(tidymodels)
library(modeltime)
library(sf)
library(leaflet)

NYPD_SHOOTING_URL <- "https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD"

Importing Data

Below, we import the shooting dataset.

shooting_raw_df <- 
    NYPD_SHOOTING_URL %>% 
    read_csv(
        col_types = cols(
            INCIDENT_KEY            = col_integer(),
            OCCUR_DATE              = col_date(format = "%m/%d/%Y"),
            OCCUR_TIME              = col_time(format = "%H:%M:%S"),
            PRECINCT                = col_integer(),
            JURISDICTION_CODE       = col_integer(),
            STATISTICAL_MURDER_FLAG = col_logical(),
            X_COORD_CD              = col_double(),
            Y_COORD_CD              = col_double(),
            Latitude                = col_double(),
            Longitude               = col_double(),
            .default                = col_character()
        )
    ) %>% 
    janitor::clean_names()

Data Cleaning

We start with a summary of the dataset:

shooting_raw_df %>% 
    summary()
##   incident_key         occur_date          occur_time           boro          
##  Min.   :  9953245   Min.   :2006-01-01   Length:28562      Length:28562      
##  1st Qu.: 65439914   1st Qu.:2009-09-04   Class1:hms        Class :character  
##  Median : 92711254   Median :2013-09-20   Class2:difftime   Mode  :character  
##  Mean   :127405824   Mean   :2014-06-07   Mode  :numeric                      
##  3rd Qu.:203131993   3rd Qu.:2019-09-29                                       
##  Max.   :279758069   Max.   :2023-12-29                                       
##                                                                               
##  loc_of_occur_desc     precinct     jurisdiction_code loc_classfctn_desc
##  Length:28562       Min.   :  1.0   Min.   :0.0000    Length:28562      
##  Class :character   1st Qu.: 44.0   1st Qu.:0.0000    Class :character  
##  Mode  :character   Median : 67.0   Median :0.0000    Mode  :character  
##                     Mean   : 65.5   Mean   :0.3219                      
##                     3rd Qu.: 81.0   3rd Qu.:0.0000                      
##                     Max.   :123.0   Max.   :2.0000                      
##                                     NA's   :2                           
##  location_desc      statistical_murder_flag perp_age_group    
##  Length:28562       Mode :logical           Length:28562      
##  Class :character   FALSE:23036             Class :character  
##  Mode  :character   TRUE :5526              Mode  :character  
##                                                               
##                                                               
##                                                               
##                                                               
##    perp_sex          perp_race         vic_age_group        vic_sex         
##  Length:28562       Length:28562       Length:28562       Length:28562      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    vic_race           x_coord_cd        y_coord_cd        latitude    
##  Length:28562       Min.   : 914928   Min.   :125757   Min.   :40.51  
##  Class :character   1st Qu.:1000068   1st Qu.:182912   1st Qu.:40.67  
##  Mode  :character   Median :1007772   Median :194901   Median :40.70  
##                     Mean   :1009424   Mean   :208380   Mean   :40.74  
##                     3rd Qu.:1016807   3rd Qu.:239814   3rd Qu.:40.82  
##                     Max.   :1066815   Max.   :271128   Max.   :40.91  
##                                                        NA's   :59     
##    longitude        lon_lat         
##  Min.   :-74.25   Length:28562      
##  1st Qu.:-73.94   Class :character  
##  Median :-73.92   Mode  :character  
##  Mean   :-73.91                     
##  3rd Qu.:-73.88                     
##  Max.   :-73.70                     
##  NA's   :59
shooting_raw_df %>% 
    glimpse()
## Rows: 28,562
## Columns: 21
## $ incident_key            <int> 244608249, 247542571, 84967535, 202853370, 270…
## $ occur_date              <date> 2022-05-05, 2022-07-04, 2012-05-27, 2019-09-2…
## $ occur_time              <time> 00:10:00, 22:20:00, 19:35:00, 21:00:00, 21:00…
## $ boro                    <chr> "MANHATTAN", "BRONX", "QUEENS", "BRONX", "BROO…
## $ loc_of_occur_desc       <chr> "INSIDE", "OUTSIDE", NA, NA, NA, NA, NA, NA, N…
## $ precinct                <int> 14, 48, 103, 42, 83, 23, 113, 77, 48, 49, 73, …
## $ jurisdiction_code       <int> 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ loc_classfctn_desc      <chr> "COMMERCIAL", "STREET", NA, NA, NA, NA, NA, NA…
## $ location_desc           <chr> "VIDEO STORE", "(null)", NA, NA, NA, "MULTI DW…
## $ statistical_murder_flag <lgl> TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, …
## $ perp_age_group          <chr> "25-44", "(null)", NA, "25-44", "25-44", NA, N…
## $ perp_sex                <chr> "M", "(null)", NA, "M", "M", NA, NA, NA, NA, "…
## $ perp_race               <chr> "BLACK", "(null)", NA, "UNKNOWN", "BLACK", NA,…
## $ vic_age_group           <chr> "25-44", "18-24", "18-24", "25-44", "25-44", "…
## $ vic_sex                 <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ vic_race                <chr> "BLACK", "BLACK", "BLACK", "BLACK", "BLACK", "…
## $ x_coord_cd              <dbl> 986050, 1016802, 1048632, 1014493, 1009149, 99…
## $ y_coord_cd              <dbl> 214231.0, 250581.0, 198262.0, 242565.0, 190104…
## $ latitude                <dbl> 40.75469, 40.85440, 40.71063, 40.83242, 40.688…
## $ longitude               <dbl> -73.99350, -73.88233, -73.76777, -73.89071, -7…
## $ lon_lat                 <chr> "POINT (-73.9935 40.754692)", "POINT (-73.8823…

A few items we will need to address in cleaning:

shooting_raw_df %>% 
    select(boro, ends_with("_age_group"), ends_with("_sex"), ends_with("_race")) %>% 
    pivot_longer(everything(), names_to = "variable", values_to = "value") %>% 
    ggplot(aes(x = value)) +
    geom_bar() +
    facet_wrap(facets = vars(variable), nrow = 2, scales = "free_x") +
    xlab(NULL) +
    ylab(NULL) +
    theme(axis.text.x.bottom = element_text(angle = 45, hjust = 1, size = 6))

shooting_clean_df <- 
    shooting_raw_df %>% 
    mutate(
        occur_datetime = lubridate::make_datetime(
            year = year(occur_date),
            month = month(occur_date),
            day = day(occur_date),
            hour = hour(occur_time),
            min = minute(occur_time),
            sec = second(occur_time),
            tz = "America/New_York"
        ),
        boro = 
            boro %>% 
            as_factor(),
        across(
            ends_with("_age_group"),
            ~ coalesce(., "UNKNOWN") %>% 
                as_factor() %>% 
                fct_collapse(UNKNOWN = c("UNKNOWN", "(null)")) %>% 
                fct_relevel("<18", "18-24", "25-44", "45-64", "65+")
        ),
        across(
            c(ends_with("_sex"), ends_with("_race")),
            ~ coalesce(., "U") %>%
                as_factor() %>% 
                fct_collapse(U = c("U", "(null)"))
        )
    )

Background

Crime, especially in large cities like New York, has been a hot topic in American political discourse lately. In New York’s case, it’s common to see news coverage (particularly from media outlets broadly considered conservative) citing a widespread malaise among New York residents about the levels of violent crime in the city.

Meanwhile, other coverage (usually from media outlets broadly considered liberal) cite declining levels of violent crime since the pandemic.

With all of this in mind, in order to better understand trends in violent crime NYC, particularly during and after the 2020 COVID pandemic, I aim to analyze the rates of shootings in the provided dataset over time, with particular attention paid to the time period from 2017 to the present. Additionally, I aim to identify any differences in crime rates by geography and look for correlations with voting patterns from the 2020 presidential election, which may give a more detailed picture of how violent crime is evolving and suggest how different communities may be perceiving this crime.

Analysis

Shootings Over Time

Below, we show a time series of the number of shootings per month.

The data appear highly seasonal within the year, so we also show the trend component of an STL decomposition as the “seasonally adjusted” number of shootings.

shootings_monthly_df <- 
    shooting_clean_df %>% 
    mutate(occur_month = lubridate::floor_date(occur_datetime, unit = "month")) %>% 
    group_by(occur_month) %>% 
    summarize(num_incidents = n())

stl_model_spec <- 
    seasonal_reg(mode = "regression", seasonal_period_1 = "1 year") %>% 
    set_engine("stlm_arima")

stl_model_fit <- 
    stl_model_spec %>% 
    fit(num_incidents ~ occur_month, data = shootings_monthly_df)


shootings_monthly_df %>% 
    mutate(trend = stl_model_fit$fit$models$model_1$stl[,"Trend"] %>% as.numeric()) %>% 
    pivot_longer(cols = c(num_incidents, trend), names_to = "variable", values_to = "value") %>% 
    ggplot(aes(x = occur_month)) +
    geom_line(aes(y = value, color = variable)) +
    geom_hline(yintercept = 0, color = "white", size = 1.5) +
    scale_x_datetime(breaks = breaks_width(width = "2 year"), labels = label_date(format = "%Y")) +
    scale_color_manual(
        name = NULL, 
        values = c("num_incidents" = "black", "trend" = "red"),
        labels = c("num_incidents" = "Observed", "trend" = "Seasonally Adj."),
    ) +
    labs(
        title = "Monthly Shooting Incidents in NYC",
        subtitle = "Shootings near 15-year low following a sharp rise during COVID"
    ) +
    xlab(NULL) +
    ylab("No. of Incidents") +
    theme(legend.position = "bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

seasonal_df <- 
    shootings_monthly_df %>% 
    mutate(
        seasonal = stl_model_fit$fit$models$model_1$stl[,"Seasonal12"] %>% as.numeric(),
        month = month(occur_month) %>% as.integer(),
        month_label = month(occur_month, label = TRUE),
        occur_year = year(occur_month)
    )


seasonal_df %>% 
    ggplot(aes(x = occur_year, y = seasonal)) +
    geom_hline(yintercept = 0, color = "white", size = 1.5) +
    geom_line(aes(color = month, group = month)) +
    geom_point(aes(color = month)) +
    ggrepel::geom_text_repel(
        aes(x = occur_year, y = seasonal, label = month_label, color = month), 
        data = filter(seasonal_df, occur_year == max(occur_year)),
        nudge_x = 1,
        segment.color = NA
    ) +
    scale_x_continuous(breaks = breaks_width(2)) +
    scale_y_continuous(breaks = breaks_width(20)) +
    scale_color_viridis_c() +
    xlab("Year") +
    ylab("Seasonal Component") +
    labs(
        title = "Seasonal Component, Monthly Shootings",
        subtitle = "Shootings peak in the summer months",
    ) +
    theme(legend.position = "none")

What’s clear from these visualizations is that shooting incidents came down steadily from 2006 to 2020, spiked during the COVID pandemic, and have fallen more or less to pre-pandemic levels since.

Correlation with Geography and Politics

The Upshot at the New York Times has compiled extremely detailed election results for the 2020 presidential election, which includes precinct-level data for the five boroughs of New York City. Using this data, we can correlate voting patterns and shooting incidents within smaller geographies in the city. See the GitHub link above for more information. For the purposes of this analysis, the relevant fields in the data are

  • votes_dem: the number of votes for the Democratic candidate (Joe Biden)
  • votes_rep: the number of votes for the Republican candidate (Donald Trump)
  • votes_total: the total number of votes cast for president in the precinct
  • pct_dem_lead: the margin by which the Democratic candidate led the Republican candidate in that precinct (calculated as (votes_dem - votes_rep) / votes_total)
nyc_borough_fips_codes <- c(
    "36061",  # Manhattan
    "36005",  # The Bronx
    "36047",  # Brooklyn
    "36081",  # Queens
    "36085"   # Staten Island
)

nyc_election_results <- 
    ## Downloaded from 
    ## https://int.nyt.com/newsgraphics/elections/map-data/2020/national/precincts-with-results.geojson.gz
    here::here("data/election-results.geojson") %>% 
    geojsonsf::geojson_sf() %>% 
    janitor::clean_names() %>% 
    mutate(fips_code = str_split_i(geoid, "-", 1)) %>% 
    filter(fips_code %in% nyc_borough_fips_codes)

By Precinct

Below, we plot the difference in incidents per year for the periods 2020-2021 (the pandemic) and 2016-2019 (pre-pandemic) by voting precinct. There appears to be very little correlation between presidential election results at the precinct level and changes in the rate of shooting incidents.

A simple linear model of this difference as a function of pct_dem_lead shows a statistically significant but infinitesimal positive correlation.

sf_use_s2(FALSE)

shooting_sf <- 
    shooting_clean_df %>% 
    mutate(location_geo = purrr::map2(longitude, latitude, ~ st_point(x = c(.x, .y)))) %>% 
    st_as_sf() %>% 
    st_set_crs(4326)

shooting_precinct <- 
    nyc_election_results %>%
    st_join(
        shooting_sf,
        join = st_intersects
    ) %>% 
    as_tibble() %>% 
    select(incident_key, geoid, occur_datetime)

shootings_2016_2019_by_precinct <- 
    shooting_precinct %>% 
    filter(year(occur_datetime) %in% 2016:2019) %>% 
    group_by(geoid) %>% 
    summarize(n_incidents_2016_2019 = sum(!is.na(incident_key)))
    
shootings_2020_2021_by_precinct <- 
    shooting_precinct %>% 
    filter(year(occur_datetime) %in% 2020:2021) %>% 
    group_by(geoid) %>% 
    summarize(n_incidents_2020_2021 = sum(!is.na(incident_key)))

shootings_diff_by_precinct <- 
    nyc_election_results %>% 
    as_tibble() %>% 
    left_join(shootings_2016_2019_by_precinct, by = "geoid") %>% 
    left_join(shootings_2020_2021_by_precinct, by = "geoid") %>% 
    mutate(
        across(starts_with("n_incidents"), ~ coalesce(., 0)),
        diff = n_incidents_2020_2021 / 2 - n_incidents_2016_2019 / 4
    ) 

linear_model <- 
    linear_reg() %>% 
    fit(diff ~ pct_dem_lead, data = shootings_diff_by_precinct)

linear_model %>% 
    broom::tidy()
## # A tibble: 2 × 5
##   term         estimate std.error statistic  p.value
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   0.0425   0.0157        2.70 6.91e- 3
## 2 pct_dem_lead  0.00230  0.000232      9.94 4.28e-23
shootings_diff_by_precinct %>% 
    mutate(model_pred = predict(linear_model, new_data = shootings_diff_by_precinct)$.pred) %>% 
    ggplot(aes(pct_dem_lead)) +
    geom_hex(aes(y = diff)) +
    geom_hline(yintercept = 0, size = 1, color = "white") +
    geom_vline(xintercept = 0, size = 1, color = "white") +
    geom_line(aes(y = model_pred), color = "red", size = 0.5) +
    scale_fill_viridis_c(name = "# precincts") +
    labs(
        title = "Change in Shootings/Yr from 2016-19 to 2020-21 vs. Dem. Margin by Precinct",
        subtitle = "Virtually no correlation exists",
        x = "Dem. Margin",
        y = "Shootings/Yr"
    ) +
    theme(
        legend.position = "bottom", 
        legend.key.height = unit(2, "mm"), 
        legend.key.width = unit(2, "cm"),
        legend.title.position = "bottom"
    )

If we look at a map of the data, it’s clear that changes in rates of shootings are highly localized. There are only a handful of precincts where shooting rates shot way up during the pandemic, and even a few precincts where shooting rates went way down, with most precincts staying at roughly the same rates.

make_shooting_label <- function(diff) {
    str_glue("<br>Change in Shootings per Year: {round(diff, 2)}</br>")
}

precinct_shooting_map_data <- 
    shootings_diff_by_precinct %>% 
    left_join(nyc_election_results %>% select(geoid, geometry), by = "geoid") %>% 
    mutate(diff = coalesce(diff, 0)) 

domain <- c(min(precinct_shooting_map_data$diff), max(precinct_shooting_map_data$diff))
lower_palette <- colorRampPalette(c("blue", "#eeeeff"), space = "Lab")(abs(domain[1]))
upper_palette <- colorRampPalette(c("#ffefef", "orange"), space = "Lab")(domain[2])
shooting_palette <- c(lower_palette, upper_palette)

precinct_shooting_map_data %>% 
    st_as_sf() %>% 
    leaflet(options = leafletOptions(minZoom = 10)) %>% 
    addTiles() %>% 
    addPolygons(
        weight = 1, 
        color = ~get('colorBin')(shooting_palette, domain)(diff), 
        fillOpacity = 0.5,
        label = ~map(make_shooting_label(diff), htmltools::HTML)
    ) %>% 
    addLegend(position = "bottomright", pal = colorBin(shooting_palette, domain), values = domain)

By Neighborhood

If we examine on the neighborhood level, we can see a much stronger correlation. Neighborhood names and boundaries used here are the ones provided by NYC OpenData. Areas in northern Manhattan, the Bronx, and Eastern Brooklyn seemed to experience the largest increases in shootings during the pandemic, whereas many areas of Staten Island and Queens saw decreases in shootings.

neighborhoods <- 
    # Downloaded from https://data.cityofnewyork.us/City-Government/Neighborhood-Names-GIS/99bc-9p23
    here::here("data/neighborhood-boundaries.geojson") %>% 
    geojsonsf::geojson_sf() %>% 
    janitor::clean_names()

shooting_diff_by_neighborhood <- 
    neighborhoods %>% 
    st_join(shootings_diff_by_precinct %>% st_as_sf(), join = st_intersects) %>% 
    as_tibble() %>% 
    select(-votes_per_sqkm) %>% 
    group_by(ntacode, ntaname, geometry) %>% 
    summarize(
        across(c(starts_with("n_incidents"), starts_with("votes_")), sum)
    ) %>% 
    ungroup() %>% 
    mutate(
        diff = n_incidents_2020_2021 / 2 - n_incidents_2016_2019 / 4,
        pct_dem_lead = 100 * (votes_dem - votes_rep) / votes_total
    )

neighborhood_linear_model <- 
    linear_reg() %>% 
    fit(diff ~ pct_dem_lead, data = shooting_diff_by_neighborhood)

neighborhood_linear_model %>% 
    broom::tidy()
## # A tibble: 2 × 5
##   term         estimate std.error statistic  p.value
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     1.55     1.13        1.37 1.71e- 1
## 2 pct_dem_lead    0.142    0.0184      7.72 6.10e-13
shooting_diff_by_neighborhood %>% 
    mutate(model_pred = predict(neighborhood_linear_model, new_data = shooting_diff_by_neighborhood)$.pred) %>% 
    ggplot(aes(pct_dem_lead)) +
    geom_hex(aes(y = diff)) +
    geom_hline(yintercept = 0, size = 1, color = "white") +
    geom_vline(xintercept = 0, size = 1, color = "white") +
    geom_line(aes(y = model_pred), color = "red", size = 0.5) +
    scale_fill_viridis_c(name = "# neighborhoods") +
    labs(
        title = "Change in Shootings/Yr from 2016-19 to 2020-21 vs. Dem. Margin by Neighborhood",
        subtitle = "Increased Dem Support Correlated with Increased Shootings",
        x = "Dem. Margin",
        y = "Shootings/Yr"
    ) +
    theme(
        legend.position = "bottom", 
        legend.key.height = unit(2, "mm"), 
        legend.key.width = unit(2, "cm"),
        legend.title.position = "bottom"
    )

domain <- c(min(shooting_diff_by_neighborhood$diff), max(shooting_diff_by_neighborhood$diff))
lower_palette <- colorRampPalette(c("blue", "#eeeeff"), space = "Lab")(abs(domain[1]))
upper_palette <- colorRampPalette(c("#ffefef", "orange"), space = "Lab")(domain[2])
shooting_palette <- c(lower_palette, upper_palette)

make_neighborhood_label <- function(votes_dem, votes_rep, pct_dem_lead, name) {
    margin <- 
        if_else(
            pct_dem_lead >= 0, 
            str_glue("D+{round(pct_dem_lead, 1)}%"), 
            str_glue("R+{round(-pct_dem_lead, 1)}%")
        )
    style <- if_else(
        pct_dem_lead >= 0,
        "color:blue;",
        "color:red;"
    )
    str_glue("<h3>{name}</h3><p>Votes for Biden: {votes_dem}</p><p>Votes for Trump: {votes_rep}</p><p><b>Margin: <span style={style}>{margin}<span></b></p>")
}

shooting_diff_by_neighborhood %>% 
    sf::st_as_sf() %>% 
    leaflet() %>% 
    addTiles() %>% 
    addPolygons(    
        weight = 1,
        color = "black",
        fillColor = ~get('colorBin')(shooting_palette, domain)(diff), 
        fillOpacity = 0.5,
        label = ~map(make_neighborhood_label(votes_dem, votes_rep, pct_dem_lead, ntaname), htmltools::HTML)
    )

It’s particularly interesting to note here how much the results of this analysis can change depending on the grain of the geography chosen, which emphasizes just how localized a phenomenon crime really is.

I think what we can say confidently is that discussing “levels of crime” in a geography as large as New York City is a fraught exercise. The rates of violence taking place have varied on a scale as small as a few blocks. For this reason, it’s difficult to say exactly how, if at all, shooting rates affected voting patterns in New York.

Possible Sources of Bias and Areas for Improvment

With crime data, there are plenty of potential sources of bias. These are a couple that come to mind, but domain experts will surely be able to identify more:

Additionally, my own personal biases certainly affect the analysis here. Crime is a heavily politicized topic of study, and my decision to analyze crime in New York through the lens of politics was definitely motivated by my personal views My perception of crime in New York is also colored by my personal experiences there - I lived in Hell’s Kitchen for a year and a half from 2021-2022. In order to mitigate these sources of bias, it was important to avoid jumping to conclusions and only draw inferences evidenced in the data.

Finally, analyzing only violent crime leaves some holes in this analysis. It’s not clear how non-violent crimes, like theft, have evolved over this time period and may also contribute to political perceptions of crime in New York.